In [1]:
from itertools import count
from scipy import sparse
from scipy.linalg import norm
from scipy.sparse.linalg import svds
import numpy as np
import timeit
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error
from math import sqrt
import matplotlib.pyplot as plt
%matplotlib inline
import time
import json

In [2]:
n = 300000

In [3]:
raw = np.genfromtxt('train_triplets.txt', max_rows=n, dtype="S40,S18,i8")

In [4]:
'''
Construct COOordinate matrix from raw
'''
row = []
col = []
data = []

user_to_row = {}
song_to_col = {}
user_count = count()
song_count = count()

for i, (uid, sid, play_count) in enumerate(raw):
    if not uid in user_to_row:
        user_to_row[uid] = next(user_count)
    row.append(user_to_row[uid])
    
    if not sid in song_to_col:
        song_to_col[sid] = next(song_count)
    col.append(song_to_col[sid])        
    
    data.append(float(play_count))

arr = np.array(data) # transform list to ndarray for conveninence

In [5]:
'''
Preprocess the data by binning the play counts into b bins 
and store data in sparse coo matrix
'''
b = 10
for i in range(b):
    arr[((arr >= ((1 << i))) & (arr <= ((1 << (i+1)) - 1)))] = (i+1)
arr[arr >= (1 << b)] = b

M = sparse.coo_matrix((arr, (row, col)))

In [6]:
'''
Helper methods for removing users/songs
'''
def removeCol(M):
    M = M.tocsc() #transform to CSC for fast column slicing
    cols = []
    for i in range(M.shape[1]):
        nnz = (M.getcol(i)).count_nonzero()
        if nnz > 5:
            cols.append(i)
    M = M[:, cols]
    return M

def removeRow(M):
    M = M.tocsr() #transform to CSR for fast row slicing
    rows = []
    for i in range(M.shape[0]):
        nnz = (M.getrow(i)).count_nonzero()
        if nnz > 5:
            rows.append(i)
    M = M[rows, :]
    return M

In [7]:
'''
Preprocess the data to avoid the cold start issue. 
That is, recursively remove all users/songs
that have less or equal than 5 occurrences in M, 
until M no longer changes.
'''
prevShape = M.shape
M = removeCol(M)
currShape = M.shape
while prevShape != currShape:
    prevShape = currShape
    M = removeCol(M)
    M = removeRow(M)
    currShape = M.shape

In [8]:
M = M.T # transpose M to have users to columns and songs to rows

In [9]:
'''
To properly evaluate our approach set-aside 200 randomly 
selected non-zero datapoints from the matrix as a test set.
That is, store their values separately and set them to 0 in M.
'''
M = M.tocsr()
nz_row_indices, nz_col_indices = M.nonzero()
nz_indices = np.vstack((nz_row_indices, nz_col_indices))
np.random.shuffle(nz_indices.T)
test_set_indices = nz_indices[:,:200]
M_test = M[test_set_indices[0,:],test_set_indices[1,:]]
M[test_set_indices[0,:],test_set_indices[1,:]] = 0 # set test instances to zero in M

In [10]:
'''
Additionally create a small validation set. That is, set-aside 1000 randomly selected
non-zero data-points and set them to ? in M. You can use this validation set for
hyper-parameter tuning and to judge the convergence of your methods.
'''
nz_row_indices, nz_col_indices = M.nonzero()
nz_indices = np.vstack((nz_row_indices, nz_col_indices))
np.random.shuffle(nz_indices.T)
val_set_indices = nz_indices[:,:1000]
M_val = M[val_set_indices[0,:], val_set_indices[1,:]]
M[val_set_indices[0,:], val_set_indices[1,:]] = 0

In [11]:
# Functions
class Timer:
    def __init__(self):
        self.timers = {}
        self.next_id = 0
    def start(self, msg=""):
        print("START %d-timer on: %s" % (self.next_id, msg))
        self.timers[self.next_id] = [msg, time.time()]
        self.next_id += 1
    def end(self):
        if self.next_id-1 in self.timers:
            end_time = time.time() - self.timers[self.next_id-1][1]
            print("END %d-timer on: %s in %.2f sec" % (
                self.next_id,
                self.timers[self.next_id-1][0],
                end_time))
            self.next_id -= 1
            return end_time
        else:
            print("NO TIMER started")
            return None


#The objective loss
def loss(M, Q, P, nnz_elem, opt):
    err = 0
    reg_x = 0
    reg_i = 0
    for idx in range(nnz_elem.shape[1]): # over all non-zero elements in M
        i = nnz_elem[0,idx]
        j = nnz_elem[1,idx]
        err += (M[i, j] - np.dot(Q[i,:], P[:, j]))**2
        reg_x += norm(P[:, j])**2
        reg_i += norm(Q[i,:])**2
    return err + opt['lambda_1']*reg_x + opt['lambda_2']*reg_i



def validation_loss(val_set_indices, M_val, Q, P, opt):
    err = 0
    reg_x = 0
    reg_i = 0
    for k in range(val_set_indices.shape[1]):
        i = val_set_indices[0,k]
        j = val_set_indices[1,k]
        err += (M_val[0, k] - np.dot(Q[i,:], P[:, j]))**2
        reg_x += norm(P[:, j])**2
        reg_i += norm(Q[i,:])**2
    return err + opt['lambda_1']*reg_x + opt['lambda_2']*reg_i


def test_error(test_set_indices, M_test, Q, P):
    err = 0
    for k in range(test_set_indices.shape[1]):
        i = test_set_indices[0,k]
        j = test_set_indices[1,k]
        err += (M_test[0, k] - np.dot(Q[i,:], P[:, j]))**2
    return sqrt(err*1.0/test_set_indices.shape[1])
    

# Standard Gradient Descent
def GD(M, Q, P, nnz_elem, val_set_indices, M_val, opt, timer=None):
    l=[]
    last_val_loss = np.inf
    last_loss = np.inf
    status = "Maximum steps reached"
    iteration_measured = False
    for step in range(opt['steps']):
        if not iteration_measured:
            timer.start("ITERATION %s" % json.dumps(opt,indent=1))
        delta_P = np.zeros_like(P)
        delta_Q = np.zeros_like(Q)
        for idx in range(nnz_elem.shape[1]):
            i = nnz_elem[0,idx]
            j = nnz_elem[1,idx]
            eij = M[i,j] - np.dot(Q[i,:], P[:,j])
            for k in range(opt['factors']):
                delta_P[k,j] += opt['learning_rate'] * 2 * (eij * Q[i,k] - opt['lambda_1'] * P[k,j])
                delta_Q[i,k] += opt['learning_rate'] * 2 * (eij * P[k,j] - opt['lambda_2'] * Q[i,k])
        P += delta_P
        Q += delta_Q
        if not iteration_measured:
            timer.end()
            iteration_measured = True
        
        if (step) % opt['epoch'] == 0:
            cur_loss = loss(M=M, Q=Q, P=P, nnz_elem=nnz_elem, opt=opt)
            l.append(cur_loss)
            val_loss = validation_loss(val_set_indices=val_set_indices, M_val=M_val, Q=Q, P=P, opt=opt)
            if val_loss > last_val_loss:
                status = "Validation loss increased"
                break
            elif abs(cur_loss - last_loss) < opt['stopping_eps']:
                status = "Training loss converged"
                break
            else:
                last_val_loss = val_loss
                last_loss = cur_loss
            print("After %d steps:\n\tTrainig Loss: %.4lf\n\tValidation Loss: %.4lf" % (step+1, l[-1], val_loss))
    plt.plot(np.arange(len(l)), l)
    return last_val_loss, status
  

'''
Get indices batch of size end - start
'''
def get_batch(nnz_elem, start, end):
    return nnz_elem[:,start:end]


'''
Get count of pairs in a set
Arguments: indices: ndarray
Returns: dict with number of occurences of each unique id found in indices
'''
def get_count(indices):
    unique, counts = np.unique(indices, return_counts=True)
    to_dict = dict(zip(unique, counts))
    return to_dict


def SGD(M, Q, P, nnz_elem, val_set_indices, M_val, opt, timer=None):
    l=[]
    last_val_loss = np.inf
    last_loss = np.inf
    start = 0
    end = 0
    status = "Maximum steps reached"
    iteration_measured = False
    np.random.shuffle(nnz_elem.T)
    for step in range(opt['steps']):
        if not iteration_measured:
            timer.start("ITERATION %s" % json.dumps(opt,indent=1))
        start = end # update bounds for current batch
        end += opt['batch_size']
        delta_P = np.zeros_like(P)
        delta_Q = np.zeros_like(Q)
        batch = get_batch(nnz_elem, start, end) # contains indices for current batch
        for idx in range(batch.shape[1]):
            i = batch[0,idx]
            j = batch[1,idx]
            eij = M[i,j] - np.dot(Q[i,:], P[:,j])
            for k in range(opt['factors']):
                delta_P[k,j] += (get_count(nnz_elem[1])[j]/get_count(batch[1])[j]) * opt['learning_rate'] * 2 * (eij * Q[i,k] - opt['lambda_1'] * P[k,j])
                delta_Q[i,k] += (get_count(nnz_elem[0])[i]/get_count(batch[0])[i]) * opt['learning_rate'] * 2 * (eij * P[k,j] - opt['lambda_2'] * Q[i,k])
        P += delta_P
        Q += delta_Q
        if not iteration_measured:
            timer.end()
            iteration_measured = True
        if (step) % opt['epoch'] == 0:
            cur_loss = loss(M=M, Q=Q, P=P, nnz_elem=nnz_elem, opt=opt)
            l.append(cur_loss)
            val_loss = validation_loss(val_set_indices=val_set_indices, M_val=M_val, Q=Q, P=P, opt=opt)
            if val_loss > last_val_loss:
                status = "Validation loss increased"
                break
            elif abs(cur_loss - last_loss) < opt['stopping_eps']:
                status = "Training loss converged"
                break
            else:
                last_val_loss = val_loss
                last_loss = cur_loss
            print("After %d steps:\n\tTrainig Loss: %.4lf\n\tValidation Loss: %.4lf" % (step+1, l[-1], val_loss))
    plt.plot(np.arange(len(l)), l)
    return last_val_loss, status

def set_optimizer_options(name, learning_rate, lambda_1, lambda_2, steps, epoch, factors, batch_size, stopping_eps):
    options = {
                'optimizer'     : name,
                'learning_rate' : learning_rate,
                'lambda_1'      : lambda_1,
                'lambda_2'      : lambda_2,
                'steps'         : steps,
                'epoch'         : epoch,
                'factors'       : factors,
                'batch_size'    : batch_size,
                'stopping_eps'  : stopping_eps
              }
    return options


def init(M,opt):
    U, S, Vt = svds(M,k=opt['factors'])
    Q = U*S
    P = Vt
    nz_i_indices, nz_x_indices = M.nonzero()
    nz_indices = np.vstack((nz_i_indices, nz_x_indices))
    return Q, P, nz_i_indices, nz_x_indices, nz_indices


'''
def train_hyperparameters(optimizer_func, M, Q, P, nnz_elem, opt, timer):
    timer.start("%s" % json.dumps(opt, indent=1))
    optimizer_func(M,Q,P,nnz_elem,opt)
    timer.end()
    pass
'''


Out[11]:
'\ndef train_hyperparameters(optimizer_func, M, Q, P, nnz_elem, opt, timer):\n    timer.start("%s" % json.dumps(opt, indent=1))\n    optimizer_func(M,Q,P,nnz_elem,opt)\n    timer.end()\n    pass\n'

In [12]:
# Creater timer object
timer = Timer()

c) Learn the optimal P and Q using standard gradient descent. That is, during each iteration you have to use all of the training examples and update Q and P for all users and songs at once.

Approach

store deltas

$$ \Delta_u = \sum\limits_{song\;s\;of\;u} \frac{\partial \mathcal{L}}{\partial p_u}$$$$ \Delta_i = \sum\limits_{user\;u\;of\;i} \frac{\partial \mathcal{L}}{\partial p_i}$$

update feature vectors

$$ p_u = p_u - \Delta_u $$$$ p_i = p_i - \Delta_i $$

In [22]:
# Standard Gradient Descent
opt = set_optimizer_options(name='GradientDescent', 
                                learning_rate=0.0001, 
                                lambda_1=0.5, 
                                lambda_2=0.5, 
                                steps=5, 
                                epoch=1, 
                                factors=30,
                                batch_size=None,
                                stopping_eps=0.001)
Q, P, nz_i_indices, nz_x_indices, nz_indices = init(M=M,opt=opt)
val_loss, status = GD(M=M, Q=Q, P=P, nnz_elem=nz_indices, val_set_indices=val_set_indices, M_val=M_val, opt=opt, timer=timer)
err = test_error(test_set_indices=test_set_indices, M_test=M_test, Q=Q, P=P)
print("STATUS: %s\nVALIDATION LOSS: %.4lf\nTEST ERROR: %.10lf" % (status, val_loss, err))


START 0-timer on: ITERATION {
 "stopping_eps": 0.001,
 "lambda_2": 0.5,
 "lambda_1": 0.5,
 "batch_size": null,
 "optimizer": "GradientDescent",
 "epoch": 1,
 "factors": 30,
 "learning_rate": 0.0001,
 "steps": 5
}
END 1-timer on: ITERATION {
 "stopping_eps": 0.001,
 "lambda_2": 0.5,
 "lambda_1": 0.5,
 "batch_size": null,
 "optimizer": "GradientDescent",
 "epoch": 1,
 "factors": 30,
 "learning_rate": 0.0001,
 "steps": 5
} in 18.88 sec
After 1 steps:
	Trainig Loss: 10974032.3922
	Validation Loss: 48114.3729
After 2 steps:
	Trainig Loss: 10106190.8349
	Validation Loss: 44602.2570
After 3 steps:
	Trainig Loss: 9326964.6217
	Validation Loss: 41399.9068
After 4 steps:
	Trainig Loss: 8626695.7838
	Validation Loss: 38540.4846
After 5 steps:
	Trainig Loss: 7997024.5068
	Validation Loss: 35914.0402
STATUS: Maximum steps reached
VALIDATION LOSS: 35914.0402
TEST ERROR: 1.7775528850

(d) Learn the optimal P and Q using the original stochastic gradient descent (mini-batches of size 1). That is, during each iteration you sample a single random training example $r_{xi}$ and update only the respective affected parameters $p_x$ and $q_i$.


In [20]:
# Stochastic Gradient Descent
opt = set_optimizer_options(name='StochasticGradientDescent', 
                                learning_rate=0.00009, 
                                lambda_1=0.5, 
                                lambda_2=0.5, 
                                steps=500, 
                                epoch=10,
                                factors=30,
                                batch_size=1,
                                stopping_eps=0.001)
#Q, P, nz_i_indices, nz_x_indices, nz_indices = init(M,opt)
val_loss, status = SGD(M=M, Q=Q, P=P, nnz_elem=nz_indices, val_set_indices=val_set_indices, M_val=M_val, opt=opt, timer=timer)
err = test_error(test_set_indices=test_set_indices, M_test=M_test, Q=Q, P=P)
print("STATUS: %s\nVALIDATION LOSS: %.4lf\nTEST ERROR: %.10lf" % (status, val_loss, err))


START 0-timer on: ITERATION {
 "stopping_eps": 0.001,
 "lambda_2": 0.5,
 "lambda_1": 0.5,
 "batch_size": 1,
 "optimizer": "StochasticGradientDescent",
 "epoch": 10,
 "factors": 30,
 "learning_rate": 9e-05,
 "steps": 500
}
END 1-timer on: ITERATION {
 "stopping_eps": 0.001,
 "lambda_2": 0.5,
 "lambda_1": 0.5,
 "batch_size": 1,
 "optimizer": "StochasticGradientDescent",
 "epoch": 10,
 "factors": 30,
 "learning_rate": 9e-05,
 "steps": 500
} in 0.60 sec
After 1 steps:
	Trainig Loss: 28046515279438688.0000
	Validation Loss: 37453381698.8534
After 11 steps:
	Trainig Loss: 28046514727521240.0000
	Validation Loss: 37453381698.7228
After 21 steps:
	Trainig Loss: 28046514728213872.0000
	Validation Loss: 37453381698.6224
After 31 steps:
	Trainig Loss: 28046514728210452.0000
	Validation Loss: 37453381698.4204
After 41 steps:
	Trainig Loss: 28046514818387568.0000
	Validation Loss: 37453381586.5587
After 51 steps:
	Trainig Loss: 28046514665246272.0000
	Validation Loss: 37453381586.5476
STATUS: Validation loss increased
VALIDATION LOSS: 37453381586.5476
TEST ERROR: 1.9966334385

e) Learn the optimal P and Q similarly to d) this time using larger mini-batches of size $S$.


In [15]:
'''
Parameters to SGD with batch size greater than 1.
'''
opt = set_optimizer_options(name='StochasticGradientDescent', 
                                learning_rate=0.0003, 
                                lambda_1=0.5, 
                                lambda_2=0.5, 
                                steps=30, 
                                epoch=5,
                                factors=30,
                                batch_size=100,
                                stopping_eps=0.001)
#Q, P, nz_i_indices, nz_x_indices, nz_indices = init(M,opt)
val_loss, status = SGD(M=M, Q=Q, P=P, nnz_elem=nz_indices, val_set_indices=val_set_indices, M_val=M_val, opt=opt, timer=timer)
err = test_error(test_set_indices=test_set_indices, M_test=M_test, Q=Q, P=P)
print("STATUS: %s\nVALIDATION LOSS: %.4lf\nTEST ERROR: %.10lf" % (status, val_loss, err))


START 0-timer on: ITERATION {
 "stopping_eps": 0.001,
 "lambda_2": 0.5,
 "lambda_1": 0.5,
 "batch_size": 100,
 "optimizer": "StochasticGradientDescent",
 "epoch": 5,
 "factors": 30,
 "learning_rate": 0.0003,
 "steps": 30
}
END 1-timer on: ITERATION {
 "stopping_eps": 0.001,
 "lambda_2": 0.5,
 "lambda_1": 0.5,
 "batch_size": 100,
 "optimizer": "StochasticGradientDescent",
 "epoch": 5,
 "factors": 30,
 "learning_rate": 0.0003,
 "steps": 30
} in 58.69 sec
After 1 steps:
	Trainig Loss: 8393357298.7354
	Validation Loss: 1061097.3375
STATUS: Validation loss increased
VALIDATION LOSS: 1061097.3375
TEST ERROR: 1.9962536052

Report

Gradient Descent

SGD with batch size of 1

SGD with batch size greater than 1

Alternating Least Squares vs. Gradient Descent

Why using ALS ?

  • optimize over the entire loss function
  • transforms a non-convex problem into a convex one

Why (Stochastic) Gradient Descent ?

  • faster, because it optimizes only part of the optimization function.
  • not so good because it does not optimize the entire loss function, but only subset of it.
  • using normal gradient descent gets quite unfeasible on large datasets, if training on the entire dataset every time.

In [ ]: